'''
#==========================================================================================#
#
# Word Embeddings for the Analysis of Ideological Placement in Parliamentary Corpora
# Political Analysis
# Ludovic Rheault and Christopher Cochrane
#
#===========================================================================================#

This file reproduces the results printed in the online appendix of the paper (Tables A2-A11 and Figures A1-A3).  We recommend using a Python virtual environment to avoid compatibility issues with the libraries needed, which are listed in the requirements.txt file.  Below is a summary of the key steps to set up a virtual environment and run the present file.  

We recommend using a version of Python 3.5 or above.  Note that for some operating systems, the python3 and pip3 commands may be needed below if Python 3 is not the default language associated with the python command.

Steps:

#1. After downloading and extracting the dataverse repository files, enter the replication folder.
unzip dataverse_files.zip
cd dataverse_files

#2a. Install the Python virtualenv package (if needed).
pip install virtualenv

#2b. Create and activate a virtual environment. 
virtualenv partyembed
source partyembed/bin/activate

#(On Windows: partyembed\Scripts\activate)

#3. Install requirements.
pip install -r requirements.txt

#4. Run the main results script (note, figures will be overwritten in the current directory).
python main_results.py

#5. To reproduce the results from the online appendix.
python appendix_results.py

#6. Deactivate the virtual environment.
deactivate

#7. Polarization figures (must be run after appendix_results.py).
Rscript polarization.R
'''

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tabulate import tabulate
from sklearn.decomposition import PCA
from gensim.models.doc2vec import Doc2Vec
import utils.labels as labels
import utils.plots as plots
from utils.guided import *
from utils.interpret import Interpret
from utils.accuracy import pairwise_accuracy, analogies, word_similarities, hyperparameter_tables
from utils.polarization import polarization_metric

#=========================================================================================#
# Table A2: Most Common Phrases
#=========================================================================================#
# Load the four trained Doc2Vec models: US Senate, US House, Canada, Britain.
mdsen, mdhouse, mdcan, mduk = [
    Doc2Vec.load(model_path)
    for model_path in ('models/senate200', 'models/house200',
                       'models/canada200', 'models/uk200')
]


def _top_phrases(pairs, topn=20):
    """Return the `topn` (phrase, count) pairs with the largest counts, largest first."""
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)[0:topn]


# Phrases are the vocabulary entries that contain an underscore.
# For the USA, only phrases found in both chambers are kept, and the House and
# Senate counts are added together.
senate_vocab = mdsen.wv.vocab
usa_phrases = _top_phrases(
    (w, entry.count + senate_vocab[w].count)
    for w, entry in mdhouse.wv.vocab.items()
    if '_' in w and w in senate_vocab
)
can_phrases = _top_phrases(
    (w, entry.count) for w, entry in mdcan.wv.vocab.items() if '_' in w
)
uk_phrases = _top_phrases(
    (w, entry.count) for w, entry in mduk.wv.vocab.items() if '_' in w
)

# One "Phrases" column and one "Count" column per country, in display order.
column_order = ['USA Phrases', 'USA Count', 'Canada Phrases', 'Canada Count', 'Britain Phrases', 'Britain Count']
column_data = []
for top in (usa_phrases, can_phrases, uk_phrases):
    column_data.append([w for w, _ in top])
    column_data.append([c for _, c in top])
phrases = pd.DataFrame(dict(zip(column_order, column_data)), columns=column_order)

# Title line, a 120-character rule, then the table itself.
with open('tables/tableA2.txt', 'w') as out:
    out.write("Table A2: Most Common Phrases\n" + "-"*120 + "\n")
    out.write(tabulate(phrases, headers="keys", showindex=False, tablefmt="orgtbl") + "\n")
print("Saved Table A2 to file tables/tableA2.txt")

#=========================================================================================#
# Figure A1: Party Placement in the US Senate (1873-2016)
#=========================================================================================#

# Loading label names:
label_dict = labels.party_labels('USA')
fullnames, parties, cols, mkers = labels.party_tags(mdsen, 'USA')
labs = [label_dict[p] for p in parties]
M = mdsen.vector_size
P = len(parties)

# Fitting PCA dimensionality reduction model for plotting:
# z holds one Senate party vector per row (P rows, M columns).
z = np.zeros((P, M))
for row, tag in enumerate(parties):
    z[row, :] = mdsen.docvecs[tag]
pca = PCA(n_components=2)
Z = pd.DataFrame(pca.fit_transform(z), columns=['dim1', 'dim2'])
Z['label'] = labs

# Re-orienting the axes for substantive interpretation:
# the sign of a principal component is arbitrary, so each axis is flipped when
# needed so that 'Dem 2015' lies below 'Rep 2015' on dim1 and above it on dim2.
# rev1 / rev2 record whether the corresponding axis was flipped.
dem_2015 = Z[Z.label == 'Dem 2015']
rep_2015 = Z[Z.label == 'Rep 2015']
rev1 = bool(dem_2015.dim1.values[0] > rep_2015.dim1.values[0])
rev2 = bool(dem_2015.dim2.values[0] < rep_2015.dim2.values[0])
if rev1:
    Z['dim1'] = Z.dim1 * (-1)
if rev2:
    Z['dim2'] = Z.dim2 * (-1)

# Reproducing Figure A1
# Each status message is built from the save path so that it reports where the
# file was actually written (the figures/ directory).
# A1a
figure_a1a_path = 'figures/figureA1a.pdf'
plots.plot_2a(Z, labs, cols, mkers, savepath=figure_a1a_path)
print("Saved Figure A1a to file " + figure_a1a_path)
# A1b
figure_a1b_path = 'figures/figureA1b.pdf'
plots.plot_timeseries(Z, fullnames, cols, dimension=1, savepath=figure_a1b_path, legend='upper left')
print("Saved Figure A1b to file " + figure_a1b_path)
# A1c
figure_a1c_path = 'figures/figureA1c.pdf'
plots.plot_timeseries(Z, fullnames, cols, dimension=2, savepath=figure_a1c_path, legend='upper left')
print("Saved Figure A1c to file " + figure_a1c_path)

#===============================================================================================#
# Table A3: Words and Phrases Most Similar to Democrats and Republicans in the 114th Congress
#===============================================================================================#

# For each party tag of the House model, list the 20 vocabulary entries closest
# to the party vector; the search is limited via restrict_vocab=20000.
house_party_sections = (
    ("Words Most Similar to House Democrats - 114th Congress", 'D_114'),
    ("Words Most Similar to House Republicans - 114th Congress", 'R_114'),
)
with open('tables/tableA3.txt', 'w') as out:
    for heading, party_tag in house_party_sections:
        print(heading, file=out)
        neighbours = mdhouse.wv.similar_by_vector(mdhouse.docvecs[party_tag], topn=20, restrict_vocab=20000)
        print(tabulate(neighbours, floatfmt=".3f"), file=out)
print("Saved Table A3 to file tables/tableA3.txt")

#===============================================================================================#
# Figure A2: Party Placement in a 2D Space using Customized Ideological Axes (Britain)
#===============================================================================================#
# Loading UK labels
uk_dict = labels.party_labels('UK')
uknames, ukparties, ukcols, ukmkers = labels.party_tags(mduk, 'UK')
uklabs = [uk_dict[p] for p in ukparties]
Muk = mduk.vector_size
Puk = len(ukparties)

# Fitting custom dimensionality reduction model for plotting:
# zuk holds one UK party vector per row (Puk rows, Muk columns).
zuk = np.zeros((Puk, Muk))
for row, tag in enumerate(ukparties):
    zuk[row, :] = mduk.docvecs[tag]
Zuk = pd.DataFrame(custom_projection_2D(zuk, mduk))
Zuk.columns = ['dim1', 'dim2']
Zuk['label'] = uklabs

# The status message is built from the save path so that it reports where the
# file was actually written (the figures/ directory).
figure_a2_path = 'figures/figureA2.pdf'
plots.plot_A2(Zuk, ukcols, ukmkers, savepath=figure_a2_path)
print("Saved Figure A2 to file " + figure_a2_path)

#=========================================================================================#
# Table A4: Accuracy of Guided Party Placement against Gold Standard
#=========================================================================================#
# Computing guided projections (one dimension per legislature):
# UK -- reuses zuk and uklabs assembled earlier in this script.
Zuk = pd.DataFrame(custom_projection_1D(zuk, mduk), columns=['dim1'])
Zuk['label'] = uklabs

# Canada
can_dict = labels.party_labels('Canada')
cannames, canparties, cancols, canmkers = labels.party_tags(mdcan, 'Canada')
canlabs = [can_dict[tag] for tag in canparties]
Mcan = mdcan.vector_size
Pcan = len(canparties)
zcan = np.zeros((Pcan, Mcan))
for row, tag in enumerate(canparties):
    zcan[row, :] = mdcan.docvecs[tag]
Zcan = pd.DataFrame(custom_projection_1D(zcan, mdcan), columns=['dim1'])
Zcan['label'] = canlabs

# US House -- looked up with the same party tags and labels as the Senate model.
zhouse = np.zeros((P, M))
for row, tag in enumerate(parties):
    zhouse[row, :] = mdhouse.docvecs[tag]
Zhouse = pd.DataFrame(custom_projection_1D(zhouse, mdhouse), columns=['dim1'])
Zhouse['label'] = labs

# US Senate -- z is the Senate party-vector matrix built earlier in this script.
Zsen = pd.DataFrame(custom_projection_1D(z, mdsen), columns=['dim1'])
Zsen['label'] = labs

# Joining projections with external gold standards.
# Left joins on 'label' keep every gold-standard row.
gold_house = pd.read_csv('data/goldstandard_house.csv')
gold_house = gold_house.merge(Zhouse, on='label', how='left')
gold_senate = pd.read_csv('data/goldstandard_senate.csv')
gold_senate = gold_senate.merge(Zsen, on='label', how='left')
gold_uk = pd.read_csv('data/goldstandard_uk.csv')
gold_uk = gold_uk.merge(Zuk, on='label', how='left')
gold_can = pd.read_csv('data/goldstandard_canada.csv')
gold_can = gold_can.merge(Zcan, on='label', how='left')

gold_scores = ['voteview', 'rile', 'vanilla', 'legacy']
countries = [('US House', gold_house),
             ('US Senate', gold_senate),
             ('Canada', gold_can),
             ('Britain', gold_uk)]

# One column per legislature; two rows (correlation, accuracy) per gold standard.
results = np.zeros((8, 4), dtype=object)
for col, (_, gold) in enumerate(countries):
    for k, score in enumerate(gold_scores):
        first_row = 2 * k
        if score == 'voteview' and 'voteview' not in gold.columns:
            # No voteview column in this gold standard: leave both cells blank.
            cells = ['', '']
        else:
            # Only rows with a non-missing gold-standard score are compared.
            scored = gold[pd.notnull(gold[score])]
            cells = ['%0.3f' % scored.dim1.corr(scored[score]),
                     '%0.2f%%' % pairwise_accuracy(scored[score].tolist(), scored.dim1.tolist())]
        results[first_row:(first_row + 2), col] = cells

results = pd.DataFrame(results, columns=[name for name, _ in countries])
results.insert(loc=0, column='Metric', value=['Correlation', 'Accuracy'] * 4)
results.insert(loc=0, column='Gold Standard',
               value=[name for name in ['Voteview', 'rile', 'vanilla', 'legacy'] for _ in range(2)])

with open('tables/tableA4.txt', 'w') as out:
    out.write("Table A4: Accuracy of Guided Party Placement against Gold Standard\n" + "-"*83 + "\n")
    out.write(tabulate(results, headers="keys", showindex=False, tablefmt="orgtbl") + "\n")
print("Saved Table A4 to file tables/tableA4.txt")

#=========================================================================================#
# Table A5: Words and Phrases for Guided Ideological Placement
#=========================================================================================#
# Write each lexicon category under its pole name, six terms per line,
# with 100-character rules around the term list.
with open('tables/tableA5.txt', 'w') as out:
    rule = '-'*100
    print("Table A5: Words and Phrases for Guided Ideological Placement\n" + rule, file=out)
    poles = ['Economic Left', 'Economic Right', 'Social Left', 'Social Right']
    for position, category in enumerate(BASE_LEXICON):
        print(poles[position], file=out)
        print(rule, file=out)
        for start in range(0, len(category), 6):
            print(', '.join(category[start:start + 6]), file=out)
        print(rule, file=out)
print("Saved Table A5 to file tables/tableA5.txt")

#=========================================================================================#
# Figure A3: Party Polarization in Britain, Canada, and the United States (1935-2015)
#=========================================================================================#
# Creating polarization datasets: one CSV per country, written without the index.
# The USA series is computed from the House model.
polarization_jobs = (
    (mduk, 'UK', 'data/uk_polar.csv'),
    (mdhouse, 'USA', 'data/us_polar.csv'),
    (mdcan, 'Canada', 'data/can_polar.csv'),
)
for polar_model, polar_country, polar_path in polarization_jobs:
    polarization_metric(polar_model, country=polar_country).to_csv(polar_path, index=False)
# The figures themselves are drawn by the separate script polarization.R
# Run: Rscript polarization.R

#=========================================================================================#
# Table A6: Effect of Layer Size on Accuracy
#=========================================================================================#
# Note: The trained models behind Tables A6 and A9-A11 amount to roughly 10GB of data.
# For simplicity, the PCA projections are combined and provided in the data/ directory.
table_a6_key = 'layer_size'
hyperparameter_tables(table_a6_key)
print("Saved Table A6 to file tables/tableA6.txt")

#=========================================================================================#
# Table A7: Word Embedding Accuracy - Analogy Tests
#=========================================================================================#
# Column 0 holds the test names; columns 1-4 hold the scores of the US House,
# US Senate, Britain and Canada models, in that order.
results = np.zeros((15, 5), dtype=object)
for column, embedding_model in enumerate([mdhouse, mdsen, mduk, mdcan], start=1):
    names, scores = analogies(embedding_model)
    results[:, column] = scores
# The names returned for the last model are used as the row labels.
results[:, 0] = names
# Four headers are supplied for five columns; the name column is expected to be
# left without a header.
with open('tables/tableA7.txt', 'w') as out:
    out.write("Table A7: Word Embedding Accuracy - Analogy Tests" + "\n")
    out.write('-'*110 + "\n")
    out.write(tabulate(results, headers=['US House','US Senate','Britain','Canada'], showindex=False, tablefmt="orgtbl") + "\n")
print("Saved Table A7 to file tables/tableA7.txt")

#=========================================================================================#
# Table A8: Word Embedding Accuracy - Word Similarity Tests
#=========================================================================================#
# Row 0: Pearson, row 1: Spearman. Column 0 holds the row label; columns 1-4
# hold the US House, US Senate, Britain and Canada models, in that order.
results = np.zeros((2, 5), dtype=object)
results[:, 0] = ['Pearson','Spearman']
for column, embedding_model in enumerate([mdhouse, mdsen, mduk, mdcan], start=1):
    pearson, spearman = word_similarities(embedding_model)
    results[0, column] = pearson
    results[1, column] = spearman
# Four headers are supplied for five columns; the label column is expected to be
# left without a header.
with open('tables/tableA8.txt', 'w') as out:
    out.write("Table A8: Word Embedding Accuracy - Word Similarity Tests" + "\n")
    out.write('-'*110 + "\n")
    out.write(tabulate(results, headers=['US House','US Senate','Britain','Canada'], showindex=False, tablefmt="orgtbl") + "\n")
print("Saved Table A8 to file tables/tableA8.txt")

#=========================================================================================#
# Table A9-A11: Extended Accuracy Results
#=========================================================================================#
# Note: The trained models behind Tables A6 and A9-A11 amount to roughly 10GB of data.
# For simplicity, the PCA projections are combined and provided in the data/ directory.
extended_tables = (
    ('Part1', "Saved Table A9 to file tables/tableA9.txt"),
    ('Part2', "Saved Table A10 to file tables/tableA10.txt"),
    ('Part3', "Saved Table A11 to file tables/tableA11.txt"),
)
for table_part, saved_message in extended_tables:
    hyperparameter_tables(table_part)
    print(saved_message)
